Libraries:
set.seed(050701)
library(foreign)
library(rms)
library(arm)
library(ResourceSelection)
library(pROC)
library(PRROC)
library(ROCR)
library(readr)
library(dplyr)
library(tidyr)
library(GGally)
library(heatmaply)
library(plotly)
library(ggplot2)
library(gridExtra)
library(ggpubr)
library(RColorBrewer)
library(scales)
library(ggmap)
library(countrycode)
library(regclass)
library(bestglm)
library(OddsPlotty)
Import and clean the dataset:
c_data <- read_csv("Speed Dating Data.csv")
Rows: 8378 Columns: 195
── Column specification ───────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): field, undergra, from, career
dbl (187): iid, id, gender, idg, condtn, wave, round, position, positin1, order, partner, pid, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#remove duplicates: keep one row per participant (iid)
#this is the full dataset
c_data <- c_data %>% group_by(iid) %>% filter(!duplicated(iid))
#features of interest:
data=subset(c_data,select=c(attr,sinc,intel,fun,amb,shar,dec,like,samerace,int_corr,prob,race,gender,age,age_o,income,goal,go_out,date,met))
#add the variable d_age: absolute age difference between the individual and the partner
data$d_age=abs(data$age-data$age_o)
#count the NAs in the features of interest
sapply(data, function(x) sum(is.na(x)))
attr sinc intel fun amb shar dec like samerace int_corr prob
9 14 14 21 40 69 0 12 0 7 15
race gender age age_o income goal go_out date met d_age
6 0 8 0 270 7 7 8 17 8
#drop the variable income: too many NAs (270)
data$income=NULL
#drop shar as well (69 NAs)
data$shar=NULL
#drop the remaining NAs: 60 observations removed
data <- na.omit(data)
#decode race:
data$race=as.factor(data$race)
levels(data$race)
[1] "1" "2" "3" "4" "6"
data$race=recode(data$race, '1' = 'Black', '2' = 'White','3' = 'Hispanic','4' = 'Asian','6'= 'Other')
#decode gender:
data$gender=as.factor(data$gender)
levels(data$gender)
[1] "0" "1"
data$gender=recode(data$gender, '1' = 'Male', '0' = 'Female')
#decode goal:
data$goal=as.factor(data$goal)
levels(data$goal)
[1] "1" "2" "3" "4" "5" "6"
data$goal=recode(data$goal, '1'='Fun', '2'='Meet', '3'='Date', '4'= 'Relationship', '5'= 'IdidIt', '6'= 'Other')
#decode go_out (social habits: how often they go out per week)
data$go_out=as.factor(data$go_out)
levels(data$go_out)
[1] "1" "2" "3" "4" "5" "6" "7"
data$go_out=recode(data$go_out, '1'='Several_pw', '2'='Twice_pw', '3'='Once_pw', '4'= 'Twice_pm', '5'= 'Once_pm', '6'= 'Several_py','7'='Almost_never')
#decode date (dating habits: how often they go on dates)
data$date=as.factor(data$date)
levels(data$date)
[1] "1" "2" "3" "4" "5" "6" "7"
data$date=recode(data$date, '1'='Several_pw', '2'='Twice_pw', '3'='Once_pw', '4'= 'Twice_pm', '5'= 'Once_pm', '6'= 'Several_py','7'='Almost_never')
#decode samerace
data$samerace=as.factor(data$samerace)
levels(data$samerace)
[1] "0" "1"
data$samerace=recode(data$samerace, '0'='NO', '1'='SI')
Who are the people in the sample we are analyzing:
attach(data)
#males and females
mf_fig=ggplot(data,aes(gender))+geom_bar(aes(fill=gender))+scale_fill_brewer(palette='Pastel1')+theme(legend.position="none")
#age by gender
age_fig=ggplot(data,aes(age))+geom_bar(aes(fill=gender))+scale_fill_brewer(palette='Pastel1')
#ethnicity
etnia_fig=ggplot(data,aes(race))+geom_bar(aes(fill=race))+theme(legend.position="none")+scale_fill_brewer(palette='Pastel1')
#HABITS AND GOALS
#go_out
goout_fig=ggplot(data,aes(go_out))+geom_bar(aes(fill=go_out))+theme(legend.position="none")+scale_fill_brewer(palette='Pastel1')
#date
date_fig=ggplot(data,aes(date))+geom_bar(aes(fill=date))+theme(legend.position="none")+scale_fill_brewer(palette='Pastel1')
#goal
goal_fig=ggplot(data,aes(goal))+geom_bar(aes(fill=goal))+theme(legend.position="none")+scale_fill_brewer(palette='Pastel1')
#combined figure
ggarrange(mf_fig,age_fig,etnia_fig,goout_fig,date_fig,goal_fig)
detach(data)
Let's visualize the relationship between the questionnaire ratings and the individual's final decision:
attach(data)
#dec vs attr
y_attr=tapply(dec,attr,mean)
decvsattr_fig=ggplot()+geom_count(aes(attr, dec))+geom_count(aes(sort(unique(attr)), y_attr, colour='red',size=3))+labs(x='attr',y='dec',title='Dec vs Attr')+theme_light()+theme(legend.position="none")
#dec vs sinc
y_sinc=tapply(dec,sinc,mean)
decvssinc_fig=ggplot()+geom_count(data = data, aes(sinc, dec))+geom_count(aes(sort(unique(sinc)), y_sinc,color='red',size=3))+labs(x='sinc',y='dec',title='Dec vs Sinc')+theme_light()+theme(legend.position="none")
#dec vs intel
y_intel=tapply(dec,intel,mean)
decvsintel_fig=ggplot()+geom_count(data = data, aes(intel, dec))+geom_count(aes(sort(unique(intel)), y_intel,color='red',size=3))+labs(x='intel',y='dec',title='Dec vs Intel')+theme_light()+theme(legend.position="none")
#dec vs fun
y_fun=tapply(dec,fun,mean)
decvsfun_fig=ggplot()+geom_count(data = data, aes(fun, dec))+geom_count(aes(sort(unique(fun)), y_fun,color='red',size=3))+labs(x='fun',y='dec',title='Dec vs Fun')+theme_light()+theme(legend.position="none")
#dec vs amb
y_amb=tapply(dec,amb,mean)
decvsamb_fig=ggplot()+geom_count(data = data, aes(amb, dec))+geom_count(aes(sort(unique(amb)), y_amb,color='red',size=3))+labs(x='amb',y='dec',title='Dec vs Amb')+theme_light()+theme(legend.position="none")
#dec vs like
y_like=tapply(dec,like,mean)
decvslike_fig=ggplot()+geom_count(data = data, aes(like, dec))+geom_count(aes(sort(unique(like)), y_like,color='red',size=3))+labs(x='like',y='dec',title='Dec vs Like')+theme_light()+theme(legend.position="none")
#dec vs prob
y_prob=tapply(dec,prob,mean)
decvsprob_fig=ggplot()+geom_count(data = data, aes(prob, dec))+geom_count(aes(sort(unique(prob)), y_prob,color='red',size=3))+labs(x='prob',y='dec',title='Dec vs Prob')+theme_light()+theme(legend.position="none")
#full figure
ggarrange(decvsattr_fig,decvssinc_fig,decvsintel_fig,decvsfun_fig,decvsamb_fig,decvslike_fig,decvsprob_fig,nrow=2)
detach(data)
Let's now see how the categorical variables gender, race, samerace, date, goal and go_out influence the response:
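The code for these figures is not reproduced here; a minimal sketch (not from the original code, assuming the data frame data and the already-loaded ggplot2 and ggpubr) of the share of positive decisions within each level:
#proportion of dec = 1 within each level of the categorical predictors
figs <- lapply(c("gender","race","samerace","date","goal","go_out"), function(v) {
  ggplot(data, aes(x = .data[[v]], fill = factor(dec))) +
    geom_bar(position = "fill") +
    scale_fill_brewer(palette = 'Pastel1', name = 'dec') +
    labs(x = v, y = 'proportion')
})
ggarrange(plotlist = figs, nrow = 2, ncol = 3)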
Let's assess the impact of the int_corr variable on the individuals' decision:
attach(data)
x=seq(-1,1,0.15)
mid=c((x[2:length(x)]+x[1:(length(x)-1)])/2)
classi=cut(int_corr,breaks=x,include.lowest=TRUE,right=FALSE)
y=tapply(dec,classi,mean)
y
[-1,-0.85) [-0.85,-0.7) [-0.7,-0.55) [-0.55,-0.4) [-0.4,-0.25) [-0.25,-0.1) [-0.1,0.05)
NA NA 1.0000000 0.3333333 0.4347826 0.3469388 0.4259259
[0.05,0.2) [0.2,0.35) [0.35,0.5) [0.5,0.65) [0.65,0.8) [0.8,0.95]
0.4637681 0.4270833 0.4315789 0.4137931 0.2500000 0.0000000
fig=ggplot()+geom_point(aes(int_corr,dec))+geom_point(aes(mid,y,color='red'))
fig
Warning: Removed 2 rows containing missing values (geom_point).
detach(data)
We conclude by looking at how the decision varies with the participants' age and with the age difference between participant and partner:
attach(data)
x=seq(min(age),max(age),2)
mid=c((x[2:length(x)]+x[1:(length(x)-1)])/2)
classi=cut(age,breaks=x,include.lowest=TRUE,right=FALSE)
y=tapply(dec,classi,mean)
y
[18,20) [20,22) [22,24) [24,26) [26,28) [28,30) [30,32) [32,34) [34,36)
0.3333333 0.6153846 0.4361702 0.4200000 0.3431373 0.4545455 0.3333333 0.4782609 0.1818182
[36,38) [38,40) [40,42) [42,44) [44,46) [46,48) [48,50) [50,52) [52,54]
1.0000000 0.0000000 NA 0.0000000 NA NA NA NA NA
fig=ggplot()+geom_count(aes(age,dec))+geom_point(aes(mid,y,color='red'))
fig
Warning: Removed 6 rows containing missing values (geom_point).
detach(data)
attach(data)
x=seq(min(d_age),max(d_age),1)
mid=c((x[2:length(x)]+x[1:(length(x)-1)])/2)
classi=cut(d_age,breaks=x,include.lowest=TRUE,right=FALSE)
y=tapply(dec,classi,mean)
y
[0,1) [1,2) [2,3) [3,4) [4,5) [5,6) [6,7) [7,8) [8,9)
0.3750000 0.4404762 0.4864865 0.3888889 0.3548387 0.4042553 0.4146341 0.3571429 0.2000000
[9,10) [10,11) [11,12) [12,13) [13,14) [14,15) [15,16) [16,17) [17,18)
0.5454545 0.3333333 0.3333333 0.6666667 1.0000000 0.5000000 0.5000000 NA NA
[18,19) [19,20) [20,21) [21,22]
0.0000000 NA NA 1.0000000
fig=ggplot()+geom_point(aes(mid,y,color='red'))+geom_count(aes(d_age,dec))+labs(x="d_age",y="dec")
fig
Warning: Removed 4 rows containing missing values (geom_point).
detach(data)
Correlation between the features:
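The correlation figure itself is not reproduced in this transcript; a minimal sketch (assuming the already-loaded heatmaply was the tool used) of the pairwise correlations among the numeric variables:
#correlation heatmap of the numeric features
num_vars <- subset(data, select = c(attr, sinc, intel, fun, amb, like, prob, int_corr, age, age_o, d_age))
heatmaply(cor(num_vars), dendrogram = 'none', limits = c(-1, 1))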
Before building the model, we split the dataset into a training set and a test set (a hold-out split for validation): we will work on the training set.
smp_size <- floor(0.8 * nrow(data))
## set the seed to make your partition reproducible
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
train <- data[train_ind, ]
test <- data[-train_ind, ]
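As a quick sanity check (not part of the original output), the proportion of positive decisions should be similar across the two partitions:
prop.table(table(train$dec))
prop.table(table(test$dec))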
We fit a logistic regression model to see which variables are significant for predicting an individual's decision:
attach(train)
mod0=glm(dec~attr+sinc+intel+fun+amb+like+samerace+int_corr+prob+race+gender+d_age+met+date+goal+go_out,family=binomial(link="logit"),train)
summary(mod0)
Call:
glm(formula = dec ~ attr + sinc + intel + fun + amb + like +
samerace + int_corr + prob + race + gender + d_age + met +
date + goal + go_out, family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2022 -0.6226 -0.1615 0.6162 3.0090
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.980569 1.644606 -4.245 2.19e-05 ***
attr 0.400795 0.119623 3.350 0.000807 ***
sinc -0.313997 0.131192 -2.393 0.016692 *
intel 0.039027 0.160043 0.244 0.807342
fun 0.382924 0.120014 3.191 0.001419 **
amb -0.211164 0.114173 -1.850 0.064385 .
like 0.555237 0.144658 3.838 0.000124 ***
sameraceSI -0.441363 0.331220 -1.333 0.182683
int_corr -0.428969 0.498617 -0.860 0.389614
prob 0.318293 0.080675 3.945 7.97e-05 ***
raceWhite -0.897190 0.676949 -1.325 0.185057
raceHispanic 0.212746 0.849669 0.250 0.802288
raceAsian 0.175952 0.703034 0.250 0.802375
raceOther -0.505070 0.810567 -0.623 0.533215
genderMale 0.632045 0.315055 2.006 0.044841 *
d_age -0.005320 0.050064 -0.106 0.915372
met 0.064430 0.156108 0.413 0.679806
dateTwice_pw 1.692848 1.276525 1.326 0.184794
dateOnce_pw 0.007163 1.114239 0.006 0.994871
dateTwice_pm 0.385804 1.078448 0.358 0.720538
dateOnce_pm 0.040669 1.105836 0.037 0.970663
dateSeveral_py 0.501765 1.094166 0.459 0.646535
dateAlmost_never -0.260916 1.109628 -0.235 0.814102
goalMeet 0.292958 0.342215 0.856 0.391963
goalDate 0.076765 0.619920 0.124 0.901450
goalRelationship 2.514666 0.918707 2.737 0.006197 **
goalIdidIt 0.431130 0.634799 0.679 0.497037
goalOther 0.025158 0.614849 0.041 0.967362
go_outTwice_pw -0.694158 0.379046 -1.831 0.067052 .
go_outOnce_pw -0.109607 0.417724 -0.262 0.793020
go_outTwice_pm -0.269284 0.706746 -0.381 0.703188
go_outOnce_pm -1.352927 1.046267 -1.293 0.195977
go_outSeveral_py -1.198028 1.585128 -0.756 0.449774
go_outAlmost_never -10.418034 790.291377 -0.013 0.989482
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 517.20 on 383 degrees of freedom
Residual deviance: 310.82 on 350 degrees of freedom
AIC: 378.82
Number of Fisher Scoring iterations: 14
modf=step(mod0,direction="both",scope=~attr+sinc+intel+fun+amb+like+samerace+int_corr+prob+race+date+gender+d_age+met+goal+go_out )
Start: AIC=378.82
dec ~ attr + sinc + intel + fun + amb + like + samerace + int_corr +
prob + race + gender + d_age + met + date + goal + go_out
Df Deviance AIC
- go_out 6 315.98 371.98
- date 6 318.61 374.61
- d_age 1 310.83 376.83
- intel 1 310.88 376.88
- met 1 310.99 376.99
- int_corr 1 311.56 377.56
- goal 5 319.84 377.84
- samerace 1 312.61 378.61
<none> 310.82 378.82
- amb 1 314.30 380.30
- gender 1 314.89 380.89
- race 4 320.97 380.97
- sinc 1 316.76 382.76
- fun 1 321.28 387.28
- attr 1 323.22 389.22
- like 1 327.48 393.48
- prob 1 327.52 393.52
Step: AIC=371.98
dec ~ attr + sinc + intel + fun + amb + like + samerace + int_corr +
prob + race + gender + d_age + met + date + goal
Df Deviance AIC
- date 6 325.06 369.06
- intel 1 315.98 369.98
- d_age 1 315.99 369.99
- met 1 316.03 370.03
- goal 5 324.51 370.51
- int_corr 1 316.72 370.72
<none> 315.98 371.98
- race 4 324.44 372.44
- samerace 1 318.53 372.53
- gender 1 318.82 372.82
- amb 1 319.35 373.35
- sinc 1 322.49 376.49
+ go_out 6 310.82 378.82
- fun 1 325.87 379.87
- attr 1 329.11 383.11
- prob 1 331.15 385.15
- like 1 333.92 387.92
Step: AIC=369.06
dec ~ attr + sinc + intel + fun + amb + like + samerace + int_corr +
prob + race + gender + d_age + met + goal
Df Deviance AIC
- met 1 325.06 367.06
- intel 1 325.07 367.07
- d_age 1 325.15 367.15
- int_corr 1 325.50 367.50
- goal 5 334.13 368.13
- race 4 332.76 368.76
<none> 325.06 369.06
- samerace 1 327.31 369.31
- gender 1 328.34 370.34
- amb 1 328.52 370.52
+ date 6 315.98 371.98
- sinc 1 332.28 374.28
+ go_out 6 318.61 374.61
- fun 1 335.22 377.22
- attr 1 339.13 381.13
- prob 1 343.20 385.20
- like 1 344.64 386.64
Step: AIC=367.06
dec ~ attr + sinc + intel + fun + amb + like + samerace + int_corr +
prob + race + gender + d_age + goal
Df Deviance AIC
- intel 1 325.07 365.07
- d_age 1 325.15 365.15
- int_corr 1 325.50 365.50
- goal 5 334.14 366.14
- race 4 332.79 366.79
<none> 325.06 367.06
- samerace 1 327.32 367.32
- gender 1 328.35 368.35
- amb 1 328.53 368.53
+ met 1 325.06 369.06
+ date 6 316.03 370.03
- sinc 1 332.33 372.33
+ go_out 6 318.63 372.63
- fun 1 335.35 375.35
- attr 1 339.14 379.14
- prob 1 343.20 383.20
- like 1 344.64 384.64
Step: AIC=365.07
dec ~ attr + sinc + fun + amb + like + samerace + int_corr +
prob + race + gender + d_age + goal
Df Deviance AIC
- d_age 1 325.16 363.16
- int_corr 1 325.52 363.52
- goal 5 334.18 364.18
- race 4 332.93 364.93
<none> 325.07 365.07
- samerace 1 327.37 365.37
- gender 1 328.39 366.39
+ intel 1 325.06 367.06
+ met 1 325.07 367.07
- amb 1 329.16 367.16
+ date 6 316.03 368.03
+ go_out 6 318.64 370.64
- sinc 1 334.13 372.13
- fun 1 335.51 373.51
- attr 1 339.16 377.16
- prob 1 343.70 381.70
- like 1 345.03 383.03
Step: AIC=363.16
dec ~ attr + sinc + fun + amb + like + samerace + int_corr +
prob + race + gender + goal
Df Deviance AIC
- int_corr 1 325.65 361.65
- goal 5 334.21 362.21
- race 4 333.03 363.03
<none> 325.16 363.16
- samerace 1 327.54 363.54
- gender 1 328.40 364.40
+ d_age 1 325.07 365.07
+ intel 1 325.15 365.15
+ met 1 325.15 365.15
- amb 1 329.39 365.39
+ date 6 316.04 366.04
+ go_out 6 318.66 368.66
- sinc 1 334.13 370.13
- fun 1 335.65 371.65
- attr 1 339.18 375.18
- prob 1 343.71 379.71
- like 1 345.50 381.50
Step: AIC=361.65
dec ~ attr + sinc + fun + amb + like + samerace + prob + race +
gender + goal
Df Deviance AIC
- goal 5 334.80 360.80
<none> 325.65 361.65
- race 4 333.76 361.76
- samerace 1 327.83 361.83
- gender 1 329.04 363.04
+ int_corr 1 325.16 363.16
+ d_age 1 325.52 363.52
+ intel 1 325.63 363.63
+ met 1 325.65 363.65
- amb 1 330.19 364.19
+ date 6 316.83 364.83
+ go_out 6 319.26 367.26
- sinc 1 334.48 368.48
- fun 1 336.50 370.50
- attr 1 340.13 374.13
- prob 1 343.97 377.97
- like 1 345.82 379.82
Step: AIC=360.8
dec ~ attr + sinc + fun + amb + like + samerace + prob + race +
gender
Df Deviance AIC
- race 4 341.62 359.62
<none> 334.80 360.80
- samerace 1 337.17 361.17
+ goal 5 325.65 361.65
- gender 1 338.03 362.03
+ int_corr 1 334.21 362.21
+ intel 1 334.73 362.73
+ d_age 1 334.74 362.74
+ met 1 334.80 362.80
- amb 1 339.03 363.03
+ date 6 325.41 363.41
+ go_out 6 328.66 366.66
- fun 1 343.75 367.75
- sinc 1 344.57 368.57
- attr 1 350.46 374.46
- like 1 354.09 378.09
- prob 1 355.28 379.28
Step: AIC=359.62
dec ~ attr + sinc + fun + amb + like + samerace + prob + gender
Df Deviance AIC
<none> 341.62 359.62
- gender 1 343.92 359.92
+ int_corr 1 340.80 360.80
+ race 4 334.80 360.80
+ intel 1 341.35 361.35
+ d_age 1 341.56 361.56
+ met 1 341.58 361.58
+ goal 5 333.76 361.76
- samerace 1 345.91 361.91
- amb 1 346.01 362.01
+ date 6 332.96 362.96
- fun 1 350.17 366.17
+ go_out 6 337.13 367.13
- sinc 1 352.04 368.04
- attr 1 357.66 373.66
- like 1 360.86 376.86
- prob 1 361.32 377.32
summary(modf)
Call:
glm(formula = dec ~ attr + sinc + fun + amb + like + samerace +
prob + gender, family = binomial(link = "logit"), data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0832 -0.7499 -0.2127 0.7489 2.7988
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.13455 0.89677 -6.841 7.88e-12 ***
attr 0.41123 0.10823 3.800 0.000145 ***
sinc -0.35148 0.11119 -3.161 0.001572 **
fun 0.30576 0.10636 2.875 0.004044 **
amb -0.20185 0.09788 -2.062 0.039191 *
like 0.55065 0.13248 4.157 3.23e-05 ***
sameraceSI -0.58016 0.28278 -2.052 0.040203 *
prob 0.31221 0.07280 4.288 1.80e-05 ***
genderMale 0.41572 0.27422 1.516 0.129521
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 517.20 on 383 degrees of freedom
Residual deviance: 341.62 on 375 degrees of freedom
AIC: 359.62
Number of Fisher Scoring iterations: 5
vif(modf)
attr sinc fun amb like sameraceSI prob genderMale
1.522973 1.645558 1.602007 1.553870 1.716699 1.051078 1.095150 1.052401
detach(train)
Let's build the classifier:
soglia=0.5
valori_reali=train$dec
valori_predetti=as.numeric(modf$fitted.values>soglia)
tab=table(valori_reali,valori_predetti)
tab
valori_predetti
valori_reali 0 1
0 172 48
1 52 112
accuratezza = sum(diag(tab))/sum(tab)
accuratezza
[1] 0.7395833
specificita = tab[1,1]/(tab[1,1]+tab[1,2])
specificita
[1] 0.7818182
FPR=1-specificita
sensitivita = tab[2,2]/(tab[2,1]+tab[2,2])
sensitivita
[1] 0.6829268
fit=modf$fitted
PRROC_obj <- roc.curve(scores.class0 = fit, weights.class0 = train$dec, curve = TRUE)
plot(PRROC_obj)
points(FPR,sensitivita,pch=4,lwd=3,cex=1.5,col='blue')
#find the optimal threshold: it appears to be 0.317
mycurve = roc(train$dec,modf$fitted.values)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
plot(mycurve,print.thres=TRUE)
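The threshold printed by print.thres=TRUE can also be extracted directly; a short sketch with pROC::coords:
coords(mycurve, 'best', best.method = 'youden', ret = c('threshold', 'specificity', 'sensitivity'))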
#recompute the misclassification tables
soglia=0.369
valori_reali=train$dec
valori_predetti=as.numeric(modf$fitted.values>soglia)
tab=table(valori_reali,valori_predetti)
tab
valori_predetti
valori_reali 0 1
0 156 64
1 31 133
accuratezza = sum(diag(tab))/sum(tab)
accuratezza
[1] 0.7526042
specificita = tab[1,1]/(tab[1,1]+tab[1,2])
specificita
[1] 0.7090909
FPR=1-specificita
sensitivita = tab[2,2]/(tab[2,1]+tab[2,2])
sensitivita
[1] 0.8109756
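Since the threshold-to-metrics computation is repeated verbatim, it could be wrapped in a small helper; a sketch (the function name is ours, not in the original):
metriche_soglia <- function(prob, reali, soglia) {
  pred <- as.numeric(prob > soglia)  #classify at the given threshold
  tab <- table(reali, pred)
  c(accuratezza = sum(diag(tab)) / sum(tab),
    specificita = tab[1, 1] / sum(tab[1, ]),
    sensitivita = tab[2, 2] / sum(tab[2, ]))
}
metriche_soglia(modf$fitted.values, train$dec, 0.369)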
Collinearity check:
vif(modf)
attr sinc fun amb like sameraceSI prob intel
1.443596 1.839755 1.759013 1.733104 1.782484 1.039053 1.111862 2.165017
We verify with an analysis-of-deviance chi-square test that the reduced model is not significantly less informative than the initial full model:
anova(modf,mod0,test="Chisq")
Analysis of Deviance Table
Model 1: dec ~ attr + sinc + fun + amb + like + samerace + prob + intel
Model 2: dec ~ attr + sinc + intel + fun + amb + like + samerace + int_corr +
prob + race + gender + d_age + met + date + goal + go_out
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 375 370.0
2 350 339.2 25 30.805 0.1956
Hosmer-Lemeshow test to assess the model's goodness of fit; we use g = 13 groups, which comfortably exceeds the model's 9 estimated parameters (see the design-matrix dimensions below):
hoslem.test(modf$y,fitted(modf),g=13)
Hosmer and Lemeshow goodness of fit (GOF) test
data: modf$y, fitted(modf)
X-squared = 11.701, df = 11, p-value = 0.3865
dim(model.matrix(modf))
[1] 384 9
Odds ratios to interpret the coefficients:
library(OddsPlotty)
plotty=odds_plot(modf)
Waiting for profiling to be done...
plotty$odds_plot
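The same odds ratios can be obtained by hand, exponentiating the coefficients and their profile-likelihood confidence intervals (a sketch):
exp(cbind(OR = coef(modf), confint(modf)))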
Let's test the fitted model on the test set:
predTest <- predict(modf, test, type="response")
soglia=0.317 # threshold for categorizing predicted probabilities
predFac <- cut(predTest, breaks=c(-Inf, soglia, Inf), labels=c('0', '1'))
Tab <- table(test$dec, predFac, dnn=c("actual", "predicted"))
Tab
predicted
actual 0 1
0 47 14
1 3 32
accuratezza = sum(diag(Tab))/sum(Tab)
accuratezza
[1] 0.8229167
specificita = Tab[1,1]/(Tab[1,1]+Tab[1,2])
specificita
[1] 0.7704918
FPR=1-specificita
sensitivita = Tab[2,2]/(Tab[2,1]+Tab[2,2])
sensitivita
[1] 0.9142857
#ROC curve on the held-out test set
PRROC_obj <- roc.curve(scores.class0 = predTest, weights.class0 = test$dec, curve = TRUE)
plot(PRROC_obj)